/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.quality; import java.util.*; /********************************************* * This finds a ranking of all known pages that * minimizes the Kendall Tau distance between the * full-ranking and each component ranking. * * @author Mike Cafarella *********************************************/ public class MarkovRankSolver { // Remember all the ranking objects Vector orderings = new Vector(); boolean solved = false; TreeMap fullRanking = new TreeMap(); /** * The MarkovRankSolver takes a bunch of rankers. When * there's a call to solveRanking(), we return an array * of all the results from those rankers. */ public MarkovRankSolver() { } /** * Add an ordering of items to the MRS' working set. * You can call this function as much as you like * prior to calling "solve". */ public void addOrdering(Object[] ordering) { orderings.add(ordering); } /** * Solving the Markov chain requires N^2 space, * where N is the number of unique items returned * by the list rankers. Keep this in mind! */ public void solveRanking() { // // 1. Get all known states from the orderings. // Uniquify them. Then build a state set. // TreeSet stateSet = new TreeSet(); Vector allPositions = new Vector(); for (Enumeration e = orderings.elements(); e.hasMoreElements(); ) { Object ordering[] = (Object[]) e.nextElement(); TreeMap curItemPositions = new TreeMap(); for (int i = 0; i < ordering.length; i++) { stateSet.add(ordering[i]); curItemPositions.put(ordering[i], new Integer(i)); } allPositions.add(curItemPositions); } int s = 0; Object states[] = new Object[stateSet.size()]; for (Iterator it = stateSet.iterator(); it.hasNext(); s++) { states[s] = it.next(); } // // 2. Build connectivity matrix. Each cell // has "1" or "0" in it. // byte transitions[][] = new byte[states.length][]; for (int i = 0; i < transitions.length; i++) { transitions[i] = new byte[states.length]; } // // 3. Iterate through each elt in the lower-left triangle. // Also fill in value for its dual in the upper-right. // for (int i = 0; i < states.length; i++) { for (int j = 0; j < i; j++) { // Find what a majority of rankers think. int item1Better = 0, item2Better = 0; for (Enumeration e = allPositions.elements(); e.hasMoreElements(); ) { TreeMap curItemPositions = (TreeMap) e.nextElement(); Integer pos1 = (Integer) curItemPositions.get(states[i]); Integer pos2 = (Integer) curItemPositions.get(states[j]); if (pos1 != null && pos2 != null) { if (pos1.intValue() < pos2.intValue()) { item1Better++; } else if (pos1.intValue() > pos2.intValue()) { item2Better++; } } } // // If there's a majority to be found, fill in the // transition matrix. We fill in a "1" when we want // to make the transition from i to j. That is, when // the majority thinks the rank at j is smaller than the // rank at i, we make the transit. // if (((item1Better > 0) || (item2Better > 0)) && (item1Better != item2Better)) { transitions[i][j] = (item1Better < item2Better) ? (byte) 1 : (byte) 0; transitions[j][i] = (byte) (1 - transitions[i][j]); } } } // // To maintain the final sorted list... // float lastStateDist[] = new float[states.length]; final float curStateDist[] = new float[states.length]; int numTransitions[] = new int[states.length]; int totalStates = states.length, numSortedStates = 0; boolean removedState[] = new boolean[states.length]; for (int i = 0; i < removedState.length; i++) { removedState[i] = false; } // // Loop until we rank all items // while (numSortedStates < totalStates) { // // 4. Find the total number of nonzero transitions // from each state // for (int i = 0; i < states.length; i++) { numTransitions[i] = 0; if (! removedState[i]) { for (int j = 0; j < states.length; j++) { if (! removedState[j]) { numTransitions[i] += transitions[i][j]; } } } } // // 5. If there are zero transitions from a given state // (that is, it's a sink), then give it a self-loop // transition entry. This means the sink node will // eventually rise to stationary likelihood of 100%. // for (int i = 0; i < numTransitions.length; i++) { if (! removedState[i] && numTransitions[i] == 0) { transitions[i][i] = 1; numTransitions[i] = 1; } } // // Build likelihoods for each state // for (int i = 0; i < states.length; i++) { lastStateDist[i] = (1.0f / (totalStates - numSortedStates)); curStateDist[i] = (1.0f / (totalStates - numSortedStates)); } // // 6. Find the stationary distribution iteratively. // // REMIND - mjc - in the future we'd like to stop // iterating based on convergence criteria rather than // a hard-coded number of loops // for (int k = 0; k < (2 * states.length); k++) { // For every target state.... for (int i = 0; i < states.length; i++) { // Init target state's likelihood to zero. curStateDist[i] = 0; // Iterate through every source state... if (! removedState[i]) { for (int j = 0; j < states.length; j++) { // // If we transit from the current source to // the current target, then adjust the target // to have its share of the source's likelihood. // if (! removedState[j] && transitions[j][i] == 1) { curStateDist[i] += (lastStateDist[j] / numTransitions[j]); } } } } // Now copy the "cur" value to "last" values. System.arraycopy(curStateDist, 0, lastStateDist, 0, curStateDist.length); } // // 7. Now detect and remove sinks. Place in sorted // list. Mark removed states in the "removedState[]" // array // TreeSet stateSorter = new TreeSet(new Comparator() { public int compare(Object o1, Object o2) { Integer pos1 = (Integer) o1; Integer pos2 = (Integer) o2; double score1 = curStateDist[pos1.intValue()]; double score2 = curStateDist[pos2.intValue()]; if (score1 > score2) { return -1; } else if (score1 == score2) { return 0; } else { return 1; } } } ); for (int i = 0; i < states.length; i++) { if (! removedState[i]) { stateSorter.add(new Integer(i)); } } // // 8. Put the top-ranked items into the sorted result list // until we find an "end-of-sinks" break. At that point // we restart the Markov-solving. // int numStatesMoved = 0; for (Iterator it = stateSorter.iterator(); it.hasNext(); ) { int index = ((Integer) it.next()).intValue(); float rating = curStateDist[index]; // // When we notice a very large drop in ratings, we // assume it's because we've found a sink region. // Remove the items in the sink and restart computation. // if ((numStatesMoved > 0) && (rating == 0 || rating < ((1.0 / (totalStates - numSortedStates)) / 10000.0))) { break; } else { fullRanking.put(states[index], new Integer(numSortedStates)); numSortedStates++; removedState[index] = true; numStatesMoved++; } } } solved = true; } /** * Find the position in the full list for the given * object. */ public int getPos(Object obj) { if (! solved) { throw new IllegalArgumentException("Must call solveRanking() first."); } Integer pos = (Integer) fullRanking.get(obj); if (pos == null) { throw new IllegalArgumentException("Unknown item obj."); } return pos.intValue(); } /** * Return total number of states in final ranking */ public int getNumStates() { return fullRanking.size(); } /** * Compute the Kendall Tau distance between a given list * of ListItem objects and the current full ranking. * Must be called after solveRanking(). */ public double getKendallTauDistance(Object testList[], boolean normalized) { if (! solved) { throw new IllegalArgumentException("Must call solveRanking() first."); } int misOrderings = 0, maxOrderings = 0; // // Go through all pairs of elts in the testList. See // if they are consistent with the fullList. // for (int i = 0; i < testList.length; i++) { for (int j = i + 1; j < testList.length; j++) { int pos1 = getPos(testList[i]); int pos2 = getPos(testList[j]); if (pos1 > pos2) { misOrderings++; } maxOrderings++; } } if (normalized) { if (maxOrderings == 0) { return 0; } return misOrderings / (1.0 * maxOrderings); } else { return misOrderings; } } /** * Test the rank-solver */ public static void main(String argv[]) throws NumberFormatException { if (argv.length < 1) { System.out.println("Usage: java net.nutch.quality.MarkovRankSolver <maxStates> [-seed <seed>]"); return; } int maxStates = Integer.parseInt(argv[0]); long seed = new Random().nextInt(); boolean usedSeed = false; if (argv.length > 1) { for (int i = 1; i < argv.length; i++) { if ("-seed".equals(argv[i])) { seed = Long.parseLong(argv[i + 1]); usedSeed = true; i++; } } } if (usedSeed) { System.out.println("Using seed: " + seed); } else { System.out.println("Seed: " + seed); } // // Finally, create the markov-model solver // MarkovRankSolver solver = new MarkovRankSolver(); Random rand = new Random(seed); // Build state set Integer states[] = new Integer[maxStates]; for (int i = 0; i < states.length; i++) { states[i] = new Integer(i); } // Build 10 slightly-different orderings for (int i = 0; i < 10; i++) { Integer ordering[] = new Integer[maxStates]; for (int j = 0; j < ordering.length; j++) { ordering[j] = states[(j + (Math.abs(rand.nextInt()) % 2)) % ordering.length]; System.out.print(ordering[j] + " "); } System.out.println(); solver.addOrdering(ordering); } // And a very boring one that contains each item Integer ordering[] = new Integer[states.length]; for (int i = 0; i < ordering.length; i++) { ordering[i] = states[i]; } solver.addOrdering(ordering); System.out.println("About to solve problem..."); solver.solveRanking(); System.out.println("-----------------------------------"); for (int i = 0; i < states.length; i++) { System.out.println(states[i] + "\t\t" + solver.getPos(states[i])); } } }